{
 "cells": [
  {
   "cell_type": "raw",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import bs4 as bs\n",
    "import urllib2\n",
    "import re\n",
    "import numpy as np\n",
    "import uuid\n",
    "import pandas as pd\n",
    "from os import walk\n",
    "import json\n",
    "import sys\n",
    "import time\n",
    "reload(sys)\n",
    "sys.setdefaultencoding('utf-8')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Combine all CSVs and remove duplicates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Title</th>\n",
       "      <th>URL</th>\n",
       "      <th>Description</th>\n",
       "      <th>Details</th>\n",
       "      <th>ShortDetails</th>\n",
       "      <th>Resource</th>\n",
       "      <th>Type</th>\n",
       "      <th>Identifiers</th>\n",
       "      <th>Db</th>\n",
       "      <th>EntrezUID</th>\n",
       "      <th>Properties</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>development and assessment of a lysophospholip...</td>\n",
       "      <td>/pubmed/28819110</td>\n",
       "      <td>Long NP, Lim DK, Mo C, Kim G, Kwon SW.</td>\n",
       "      <td>Sci Rep. 2017 Aug 17;7(1):8552. doi: 10.1038/s...</td>\n",
       "      <td>Sci Rep.  2017</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28819110 | PMCID:PMC5561257</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28819110</td>\n",
       "      <td>create date:2017/08/19 | first author:Long NP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>omni-polya: a method and tool for accurate rec...</td>\n",
       "      <td>/pubmed/28810905</td>\n",
       "      <td>Magana-Mora A, Kalkatawi M, Bajic VB.</td>\n",
       "      <td>BMC Genomics. 2017 Aug 15;18(1):620. doi: 10.1...</td>\n",
       "      <td>BMC Genomics.  2017</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28810905 | PMCID:PMC5558757</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28810905</td>\n",
       "      <td>create date:2017/08/16 | first author:Magana-M...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>vps35-related parkinson disease</td>\n",
       "      <td>/pubmed/28796472</td>\n",
       "      <td>Deutschländer A, Ross OA, Wszolek ZK.</td>\n",
       "      <td>2017 Aug 10. In: Pagon RA, Adam MP, Ardinger H...</td>\n",
       "      <td>GeneReviews&lt;sup&gt;®&lt;/sup&gt;.  1993</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28796472</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28796472</td>\n",
       "      <td>create date:2017/08/11 | first author:Deutschl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>inicu - integrated neonatal care unit: capturi...</td>\n",
       "      <td>/pubmed/28748430</td>\n",
       "      <td>Singh H, Yadav G, Mallaiah R, Joshi P, Joshi V...</td>\n",
       "      <td>J Med Syst. 2017 Aug;41(8):132. doi: 10.1007/s...</td>\n",
       "      <td>J Med Syst.  2017</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28748430 | PMCID:PMC5529490</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28748430</td>\n",
       "      <td>create date:2017/07/28 | first author:Singh H</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>deepbipolar: identifying genomic mutations for...</td>\n",
       "      <td>/pubmed/28600868</td>\n",
       "      <td>Laksshman S, Bhat RR, Viswanath V, Li X.</td>\n",
       "      <td>Hum Mutat. 2017 Sep;38(9):1217-1224. doi: 10.1...</td>\n",
       "      <td>Hum Mutat.  2017</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28600868</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28600868</td>\n",
       "      <td>create date:2017/06/11 | first author:Laksshman S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               Title               URL  \\\n",
       "0  development and assessment of a lysophospholip...  /pubmed/28819110   \n",
       "1  omni-polya: a method and tool for accurate rec...  /pubmed/28810905   \n",
       "2                    vps35-related parkinson disease  /pubmed/28796472   \n",
       "3  inicu - integrated neonatal care unit: capturi...  /pubmed/28748430   \n",
       "4  deepbipolar: identifying genomic mutations for...  /pubmed/28600868   \n",
       "\n",
       "                                         Description  \\\n",
       "0             Long NP, Lim DK, Mo C, Kim G, Kwon SW.   \n",
       "1              Magana-Mora A, Kalkatawi M, Bajic VB.   \n",
       "2              Deutschländer A, Ross OA, Wszolek ZK.   \n",
       "3  Singh H, Yadav G, Mallaiah R, Joshi P, Joshi V...   \n",
       "4           Laksshman S, Bhat RR, Viswanath V, Li X.   \n",
       "\n",
       "                                             Details  \\\n",
       "0  Sci Rep. 2017 Aug 17;7(1):8552. doi: 10.1038/s...   \n",
       "1  BMC Genomics. 2017 Aug 15;18(1):620. doi: 10.1...   \n",
       "2  2017 Aug 10. In: Pagon RA, Adam MP, Ardinger H...   \n",
       "3  J Med Syst. 2017 Aug;41(8):132. doi: 10.1007/s...   \n",
       "4  Hum Mutat. 2017 Sep;38(9):1217-1224. doi: 10.1...   \n",
       "\n",
       "                     ShortDetails Resource      Type  \\\n",
       "0                  Sci Rep.  2017   PubMed  citation   \n",
       "1             BMC Genomics.  2017   PubMed  citation   \n",
       "2  GeneReviews<sup>®</sup>.  1993   PubMed  citation   \n",
       "3               J Med Syst.  2017   PubMed  citation   \n",
       "4                Hum Mutat.  2017   PubMed  citation   \n",
       "\n",
       "                        Identifiers      Db EntrezUID  \\\n",
       "0  PMID:28819110 | PMCID:PMC5561257  pubmed  28819110   \n",
       "1  PMID:28810905 | PMCID:PMC5558757  pubmed  28810905   \n",
       "2                     PMID:28796472  pubmed  28796472   \n",
       "3  PMID:28748430 | PMCID:PMC5529490  pubmed  28748430   \n",
       "4                     PMID:28600868  pubmed  28600868   \n",
       "\n",
       "                                          Properties  \n",
       "0      create date:2017/08/19 | first author:Long NP  \n",
       "1  create date:2017/08/16 | first author:Magana-M...  \n",
       "2  create date:2017/08/11 | first author:Deutschl...  \n",
       "3      create date:2017/07/28 | first author:Singh H  \n",
       "4  create date:2017/06/11 | first author:Laksshman S  "
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "path = '/home/ahmed/Dropbox/DFCI/14_zoo/scrap-science/pubmed/raw_result'\n",
    "\n",
    "# Read every file found below `path` into its own frame.\n",
    "# The file path is built from the directory walk() is currently visiting,\n",
    "# so files that live in sub-directories resolve correctly as well.\n",
    "# File names are sorted so that the concatenation order -- and therefore the\n",
    "# copy that drop_duplicates(keep='first') retains -- is reproducible.\n",
    "dfs = []\n",
    "for (dirpath, _, filenames) in walk(path):\n",
    "    for filename in sorted(filenames):\n",
    "        fullPath = dirpath + '/' + filename\n",
    "        print(fullPath)\n",
    "        dfs.append(pd.read_csv(fullPath, index_col=False, header=0))\n",
    "\n",
    "# stack all frames and give the result a fresh 0..n-1 index\n",
    "df = pd.concat(dfs).reset_index(drop=True)\n",
    "print(df.shape)\n",
    "\n",
    "# normalise titles so that near-identical titles compare equal:\n",
    "# lower-case, strip surrounding whitespace, drop periods,\n",
    "# and turn each double space into a single space (one pass)\n",
    "df['Title'] = df.Title.str.lower()\n",
    "df['Title'] = df.Title.str.strip()\n",
    "df['Title'] = df.Title.str.replace('.', '')\n",
    "df['Title'] = df.Title.str.replace('  ', ' ')\n",
    "\n",
    "# remove duplicates by normalised title, keeping the first occurrence\n",
    "df = df.drop_duplicates(subset='Title', keep=\"first\").reset_index(drop=True)\n",
    "print(df.shape)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Title</th>\n",
       "      <th>URL</th>\n",
       "      <th>Description</th>\n",
       "      <th>Details</th>\n",
       "      <th>ShortDetails</th>\n",
       "      <th>Resource</th>\n",
       "      <th>Type</th>\n",
       "      <th>Identifiers</th>\n",
       "      <th>Db</th>\n",
       "      <th>EntrezUID</th>\n",
       "      <th>Properties</th>\n",
       "      <th>abstract</th>\n",
       "      <th>email</th>\n",
       "      <th>keywords</th>\n",
       "      <th>fullURL</th>\n",
       "      <th>source</th>\n",
       "      <th>year</th>\n",
       "      <th>key</th>\n",
       "      <th>use</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>development and assessment of a lysophospholip...</td>\n",
       "      <td>/pubmed/28819110</td>\n",
       "      <td>Long NP, Lim DK, Mo C, Kim G, Kwon SW.</td>\n",
       "      <td>Sci Rep. 2017 Aug 17;7(1):8552. doi: 10.1038/s...</td>\n",
       "      <td>Sci Rep.  2017</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28819110 | PMCID:PMC5561257</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28819110</td>\n",
       "      <td>create date:2017/08/19 | first author:Long NP</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>pubmed</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>omni-polya: a method and tool for accurate rec...</td>\n",
       "      <td>/pubmed/28810905</td>\n",
       "      <td>Magana-Mora A, Kalkatawi M, Bajic VB.</td>\n",
       "      <td>BMC Genomics. 2017 Aug 15;18(1):620. doi: 10.1...</td>\n",
       "      <td>BMC Genomics.  2017</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28810905 | PMCID:PMC5558757</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28810905</td>\n",
       "      <td>create date:2017/08/16 | first author:Magana-M...</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>pubmed</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>vps35-related parkinson disease</td>\n",
       "      <td>/pubmed/28796472</td>\n",
       "      <td>Deutschländer A, Ross OA, Wszolek ZK.</td>\n",
       "      <td>2017 Aug 10. In: Pagon RA, Adam MP, Ardinger H...</td>\n",
       "      <td>GeneReviews&lt;sup&gt;®&lt;/sup&gt;.  1993</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28796472</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28796472</td>\n",
       "      <td>create date:2017/08/11 | first author:Deutschl...</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>pubmed</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>inicu - integrated neonatal care unit: capturi...</td>\n",
       "      <td>/pubmed/28748430</td>\n",
       "      <td>Singh H, Yadav G, Mallaiah R, Joshi P, Joshi V...</td>\n",
       "      <td>J Med Syst. 2017 Aug;41(8):132. doi: 10.1007/s...</td>\n",
       "      <td>J Med Syst.  2017</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28748430 | PMCID:PMC5529490</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28748430</td>\n",
       "      <td>create date:2017/07/28 | first author:Singh H</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>pubmed</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               Title               URL  \\\n",
       "0  development and assessment of a lysophospholip...  /pubmed/28819110   \n",
       "1  omni-polya: a method and tool for accurate rec...  /pubmed/28810905   \n",
       "2                    vps35-related parkinson disease  /pubmed/28796472   \n",
       "3  inicu - integrated neonatal care unit: capturi...  /pubmed/28748430   \n",
       "\n",
       "                                         Description  \\\n",
       "0             Long NP, Lim DK, Mo C, Kim G, Kwon SW.   \n",
       "1              Magana-Mora A, Kalkatawi M, Bajic VB.   \n",
       "2              Deutschländer A, Ross OA, Wszolek ZK.   \n",
       "3  Singh H, Yadav G, Mallaiah R, Joshi P, Joshi V...   \n",
       "\n",
       "                                             Details  \\\n",
       "0  Sci Rep. 2017 Aug 17;7(1):8552. doi: 10.1038/s...   \n",
       "1  BMC Genomics. 2017 Aug 15;18(1):620. doi: 10.1...   \n",
       "2  2017 Aug 10. In: Pagon RA, Adam MP, Ardinger H...   \n",
       "3  J Med Syst. 2017 Aug;41(8):132. doi: 10.1007/s...   \n",
       "\n",
       "                     ShortDetails Resource      Type  \\\n",
       "0                  Sci Rep.  2017   PubMed  citation   \n",
       "1             BMC Genomics.  2017   PubMed  citation   \n",
       "2  GeneReviews<sup>®</sup>.  1993   PubMed  citation   \n",
       "3               J Med Syst.  2017   PubMed  citation   \n",
       "\n",
       "                        Identifiers      Db EntrezUID  \\\n",
       "0  PMID:28819110 | PMCID:PMC5561257  pubmed  28819110   \n",
       "1  PMID:28810905 | PMCID:PMC5558757  pubmed  28810905   \n",
       "2                     PMID:28796472  pubmed  28796472   \n",
       "3  PMID:28748430 | PMCID:PMC5529490  pubmed  28748430   \n",
       "\n",
       "                                          Properties abstract email keywords  \\\n",
       "0      create date:2017/08/19 | first author:Long NP                           \n",
       "1  create date:2017/08/16 | first author:Magana-M...                           \n",
       "2  create date:2017/08/11 | first author:Deutschl...                           \n",
       "3      create date:2017/07/28 | first author:Singh H                           \n",
       "\n",
       "  fullURL  source  year key  use  \n",
       "0          pubmed     0        0  \n",
       "1          pubmed     0        0  \n",
       "2          pubmed     0        0  \n",
       "3          pubmed     0        0  "
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# add these columns\n",
    "# Add the columns that are filled in later, each with an empty default.\n",
    "df['abstract'] = \"\"\n",
    "df['email'] = \"\"\n",
    "df['keywords'] = \"\"\n",
    "df['fullURL'] = \"\"\n",
    "# a scalar is broadcast to every row\n",
    "df['source'] = 'pubmed'\n",
    "# builtin int replaces the np.int alias (deprecated in NumPy 1.20, removed in 1.24)\n",
    "df['year'] = np.zeros(df.shape[0], dtype=int)\n",
    "df['key'] = \"\"\n",
    "\n",
    "# inclusion flag, 0 by default\n",
    "df['use'] = np.zeros(df.shape[0], dtype=int)\n",
    "print(df.shape)\n",
    "df.head(n=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Scrape the PubMed page of every row and fill in fullURL, abstract, email,\n",
    "# keywords, use, year and key. The whole frame is written to pubmed_temp.csv\n",
    "# after each successfully fetched row, so an interrupted run loses little work.\n",
    "\n",
    "# years (as strings) whose presence in ShortDetails marks a row for use\n",
    "years = range(2014,3000)\n",
    "yearsToInclude = [ str(x) for x in years]\n",
    "\n",
    "# Row at which scraping starts.\n",
    "# NOTE(review): 995 presumably resumes an earlier, interrupted run -- set to 0\n",
    "# to scrape every row on a fresh kernel.\n",
    "start_index = 995\n",
    "\n",
    "for i in range(start_index, df.shape[0]):\n",
    "\n",
    "    # df.URL already starts with '/', so the host carries no trailing slash\n",
    "    # (otherwise the stored fullURL contains '//pubmed/...')\n",
    "    url = 'https://www.ncbi.nlm.nih.gov' + df.iloc[i].URL\n",
    "\n",
    "    # fetch the page and parse it; rows whose page cannot be fetched are skipped\n",
    "    try:\n",
    "        response = urllib2.urlopen(url)\n",
    "        soup = bs.BeautifulSoup(response,\"html\")\n",
    "    except urllib2.HTTPError:\n",
    "        print(str(i) + \" no http\")\n",
    "        continue\n",
    "\n",
    "    # df.at[row, col] replaces DataFrame.set_value, which pandas deprecated\n",
    "    # in 0.21 and removed in 1.0\n",
    "\n",
    "    # 0 set fullURL\n",
    "    df.at[i, \"fullURL\"] = url\n",
    "\n",
    "    # 1 get abstract (stored as HTML, double quotes turned into single quotes)\n",
    "    abstract = soup.findAll(\"div\",{\"class\":\"abstr\"})\n",
    "    if len(abstract) == 1:\n",
    "        df.at[i, \"abstract\"] = str(abstract[0]).replace('\"',\"'\")\n",
    "\n",
    "    # 2 get email: first address-like token in the affiliation list\n",
    "    afflist = soup.findAll(\"div\", { \"class\" : \"afflist\" })\n",
    "    if len(afflist) == 1:\n",
    "        email = re.search(r'[\\w\\.-]+@[\\w\\.-]+', str(afflist[0]) )\n",
    "        if email:\n",
    "            # the pattern also swallows sentence-ending periods; strip them\n",
    "            df.at[i, \"email\"] = email.group(0).rstrip('.')\n",
    "\n",
    "    # 3 get keywords\n",
    "    keywords = soup.findAll(\"div\", { \"class\" : \"keywords\" })\n",
    "    if len(keywords) == 1:\n",
    "        df.at[i, \"keywords\"] = str(keywords[0].p.text).replace('\"',\"'\")\n",
    "\n",
    "    print(i)\n",
    "\n",
    "    # 4 set use when ShortDetails mentions one of the years to include\n",
    "    short_details = df.iloc[i].ShortDetails\n",
    "    if any(word in short_details for word in yearsToInclude):\n",
    "        df.at[i, \"use\"] = 1\n",
    "\n",
    "    # 5 set year from the last four characters of ShortDetails; keep the\n",
    "    # default 0 when they are not a number instead of aborting the whole run\n",
    "    try:\n",
    "        df.at[i, \"year\"] = int(short_details[-4:])\n",
    "    except ValueError:\n",
    "        print(str(i) + \" no year\")\n",
    "\n",
    "    # 6 set key\n",
    "    df.at[i, \"key\"] = str(uuid.uuid4())\n",
    "\n",
    "    # pause between requests, then checkpoint everything scraped so far\n",
    "    time.sleep(2)\n",
    "    df.to_csv(\"pubmed_temp.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Remove rows with use = 0 to make the manual check easier (should reset_index, but I didn't)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# temp\n",
    "df = pd.DataFrame.from_csv('/home/ahmed/Dropbox/DFCI/14_zoo/scrap-science/pubmed/pubmed_temp.csv')\n",
    "print df.shape\n",
    "df = df [df.use == 1]\n",
    "print df.shape\n",
    "df = df [df.year >= 2014]\n",
    "print df.shape\n",
    "df.to_csv(\"pubmed_dirty.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load the final file and try to add more emails"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>URL</th>\n",
       "      <th>Description</th>\n",
       "      <th>Details</th>\n",
       "      <th>ShortDetails</th>\n",
       "      <th>Resource</th>\n",
       "      <th>Type</th>\n",
       "      <th>Identifiers</th>\n",
       "      <th>Db</th>\n",
       "      <th>EntrezUID</th>\n",
       "      <th>Properties</th>\n",
       "      <th>abstract</th>\n",
       "      <th>email</th>\n",
       "      <th>keywords</th>\n",
       "      <th>fullURL</th>\n",
       "      <th>source</th>\n",
       "      <th>year</th>\n",
       "      <th>key</th>\n",
       "      <th>use</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Title</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>deepbipolar: identifying genomic mutations for bipolar disorder via deep learning</th>\n",
       "      <td>/pubmed/28600868</td>\n",
       "      <td>Laksshman S, Bhat RR, Viswanath V, Li X.</td>\n",
       "      <td>Hum Mutat. 2017 Sep;38(9):1217-1224. doi: 10.1...</td>\n",
       "      <td>Hum Mutat.  2017</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28600868</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28600868</td>\n",
       "      <td>create date:2017/06/11 | first author:Laksshman S</td>\n",
       "      <td>&lt;div class='abstr'&gt;&lt;h3&gt;Abstract&lt;/h3&gt;&lt;div class...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>bipolar disorder; convolutional neural network...</td>\n",
       "      <td>https://www.ncbi.nlm.nih.gov//pubmed/28600868</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>2017</td>\n",
       "      <td>c3510786-340b-45d7-9272-eee0c8960971</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>precision medicine for heart failure with preserved ejection fraction: an overview</th>\n",
       "      <td>/pubmed/28585183</td>\n",
       "      <td>Shah SJ.</td>\n",
       "      <td>J Cardiovasc Transl Res. 2017 Jun;10(3):233-24...</td>\n",
       "      <td>J Cardiovasc Transl Res.  2017</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28585183 | PMCID:PMC5540576</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28585183</td>\n",
       "      <td>create date:2017/06/07 | first author:Shah SJ</td>\n",
       "      <td>&lt;div class='abstr'&gt;&lt;h3&gt;Abstract&lt;/h3&gt;&lt;div class...</td>\n",
       "      <td>sanjiv.shah@northwestern.edu</td>\n",
       "      <td>Cluster analysis; Heart failure with preserved...</td>\n",
       "      <td>https://www.ncbi.nlm.nih.gov//pubmed/28585183</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>2017</td>\n",
       "      <td>640584a9-ad0c-46cf-97f9-575c47f80b99</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>deep learning for healthcare: review, opportunities and challenges</th>\n",
       "      <td>/pubmed/28481991</td>\n",
       "      <td>Miotto R, Wang F, Wang S, Jiang X, Dudley JT.</td>\n",
       "      <td>Brief Bioinform. 2017 May 6. doi: 10.1093/bib/...</td>\n",
       "      <td>Brief Bioinform.  2017</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28481991</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28481991</td>\n",
       "      <td>create date:2017/05/10 | first author:Miotto R</td>\n",
       "      <td>&lt;div class='abstr'&gt;&lt;h3&gt;Abstract&lt;/h3&gt;&lt;div class...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>biomedical informatics; deep learning; electro...</td>\n",
       "      <td>https://www.ncbi.nlm.nih.gov//pubmed/28481991</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>2017</td>\n",
       "      <td>bae357bb-3bec-43da-8cc4-5095bc4d9bbe</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hla class i binding prediction via convolutional neural networks</th>\n",
       "      <td>/pubmed/28444127</td>\n",
       "      <td>Vang YS, Xie X.</td>\n",
       "      <td>Bioinformatics. 2017 Sep 1;33(17):2658-2665. d...</td>\n",
       "      <td>Bioinformatics.  2017</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28444127</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28444127</td>\n",
       "      <td>create date:2017/04/27 | first author:Vang YS</td>\n",
       "      <td>&lt;div class='abstr'&gt;&lt;h3&gt;Abstract&lt;/h3&gt;&lt;div class...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>https://www.ncbi.nlm.nih.gov//pubmed/28444127</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>2017</td>\n",
       "      <td>de37062a-9cfd-460b-9c90-e94689fefd04</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>deepcpg: accurate prediction of single-cell dna methylation states using deep learning</th>\n",
       "      <td>/pubmed/28395661</td>\n",
       "      <td>Angermueller C, Lee HJ, Reik W, Stegle O.</td>\n",
       "      <td>Genome Biol. 2017 Apr 11;18(1):67. doi: 10.118...</td>\n",
       "      <td>Genome Biol.  2017</td>\n",
       "      <td>PubMed</td>\n",
       "      <td>citation</td>\n",
       "      <td>PMID:28395661 | PMCID:PMC5387360</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>28395661</td>\n",
       "      <td>create date:2017/04/12 | first author:Angermue...</td>\n",
       "      <td>&lt;div class='abstr'&gt;&lt;h3&gt;Abstract&lt;/h3&gt;&lt;div class...</td>\n",
       "      <td>cangermueller@ebi.ac.uk</td>\n",
       "      <td>Artificial neural network; DNA methylation; De...</td>\n",
       "      <td>https://www.ncbi.nlm.nih.gov//pubmed/28395661</td>\n",
       "      <td>pubmed</td>\n",
       "      <td>2017</td>\n",
       "      <td>bb3fc9dd-5fc2-4cd5-9173-485ab5481c7e</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                 URL  \\\n",
       "Title                                                                  \n",
       "deepbipolar: identifying genomic mutations for ...  /pubmed/28600868   \n",
       "precision medicine for heart failure with prese...  /pubmed/28585183   \n",
       "deep learning for healthcare: review, opportuni...  /pubmed/28481991   \n",
       "hla class i binding prediction via convolutiona...  /pubmed/28444127   \n",
       "deepcpg: accurate prediction of single-cell dna...  /pubmed/28395661   \n",
       "\n",
       "                                                                                      Description  \\\n",
       "Title                                                                                               \n",
       "deepbipolar: identifying genomic mutations for ...       Laksshman S, Bhat RR, Viswanath V, Li X.   \n",
       "precision medicine for heart failure with prese...                                       Shah SJ.   \n",
       "deep learning for healthcare: review, opportuni...  Miotto R, Wang F, Wang S, Jiang X, Dudley JT.   \n",
       "hla class i binding prediction via convolutiona...                                Vang YS, Xie X.   \n",
       "deepcpg: accurate prediction of single-cell dna...      Angermueller C, Lee HJ, Reik W, Stegle O.   \n",
       "\n",
       "                                                                                              Details  \\\n",
       "Title                                                                                                   \n",
       "deepbipolar: identifying genomic mutations for ...  Hum Mutat. 2017 Sep;38(9):1217-1224. doi: 10.1...   \n",
       "precision medicine for heart failure with prese...  J Cardiovasc Transl Res. 2017 Jun;10(3):233-24...   \n",
       "deep learning for healthcare: review, opportuni...  Brief Bioinform. 2017 May 6. doi: 10.1093/bib/...   \n",
       "hla class i binding prediction via convolutiona...  Bioinformatics. 2017 Sep 1;33(17):2658-2665. d...   \n",
       "deepcpg: accurate prediction of single-cell dna...  Genome Biol. 2017 Apr 11;18(1):67. doi: 10.118...   \n",
       "\n",
       "                                                                      ShortDetails  \\\n",
       "Title                                                                                \n",
       "deepbipolar: identifying genomic mutations for ...                Hum Mutat.  2017   \n",
       "precision medicine for heart failure with prese...  J Cardiovasc Transl Res.  2017   \n",
       "deep learning for healthcare: review, opportuni...          Brief Bioinform.  2017   \n",
       "hla class i binding prediction via convolutiona...           Bioinformatics.  2017   \n",
       "deepcpg: accurate prediction of single-cell dna...              Genome Biol.  2017   \n",
       "\n",
       "                                                   Resource      Type  \\\n",
       "Title                                                                   \n",
       "deepbipolar: identifying genomic mutations for ...   PubMed  citation   \n",
       "precision medicine for heart failure with prese...   PubMed  citation   \n",
       "deep learning for healthcare: review, opportuni...   PubMed  citation   \n",
       "hla class i binding prediction via convolutiona...   PubMed  citation   \n",
       "deepcpg: accurate prediction of single-cell dna...   PubMed  citation   \n",
       "\n",
       "                                                                         Identifiers  \\\n",
       "Title                                                                                  \n",
       "deepbipolar: identifying genomic mutations for ...                     PMID:28600868   \n",
       "precision medicine for heart failure with prese...  PMID:28585183 | PMCID:PMC5540576   \n",
       "deep learning for healthcare: review, opportuni...                     PMID:28481991   \n",
       "hla class i binding prediction via convolutiona...                     PMID:28444127   \n",
       "deepcpg: accurate prediction of single-cell dna...  PMID:28395661 | PMCID:PMC5387360   \n",
       "\n",
       "                                                        Db  EntrezUID  \\\n",
       "Title                                                                   \n",
       "deepbipolar: identifying genomic mutations for ...  pubmed   28600868   \n",
       "precision medicine for heart failure with prese...  pubmed   28585183   \n",
       "deep learning for healthcare: review, opportuni...  pubmed   28481991   \n",
       "hla class i binding prediction via convolutiona...  pubmed   28444127   \n",
       "deepcpg: accurate prediction of single-cell dna...  pubmed   28395661   \n",
       "\n",
       "                                                                                           Properties  \\\n",
       "Title                                                                                                   \n",
       "deepbipolar: identifying genomic mutations for ...  create date:2017/06/11 | first author:Laksshman S   \n",
       "precision medicine for heart failure with prese...      create date:2017/06/07 | first author:Shah SJ   \n",
       "deep learning for healthcare: review, opportuni...     create date:2017/05/10 | first author:Miotto R   \n",
       "hla class i binding prediction via convolutiona...      create date:2017/04/27 | first author:Vang YS   \n",
       "deepcpg: accurate prediction of single-cell dna...  create date:2017/04/12 | first author:Angermue...   \n",
       "\n",
       "                                                                                             abstract  \\\n",
       "Title                                                                                                   \n",
       "deepbipolar: identifying genomic mutations for ...  <div class='abstr'><h3>Abstract</h3><div class...   \n",
       "precision medicine for heart failure with prese...  <div class='abstr'><h3>Abstract</h3><div class...   \n",
       "deep learning for healthcare: review, opportuni...  <div class='abstr'><h3>Abstract</h3><div class...   \n",
       "hla class i binding prediction via convolutiona...  <div class='abstr'><h3>Abstract</h3><div class...   \n",
       "deepcpg: accurate prediction of single-cell dna...  <div class='abstr'><h3>Abstract</h3><div class...   \n",
       "\n",
       "                                                                           email  \\\n",
       "Title                                                                              \n",
       "deepbipolar: identifying genomic mutations for ...                           NaN   \n",
       "precision medicine for heart failure with prese...  sanjiv.shah@northwestern.edu   \n",
       "deep learning for healthcare: review, opportuni...                           NaN   \n",
       "hla class i binding prediction via convolutiona...                           NaN   \n",
       "deepcpg: accurate prediction of single-cell dna...       cangermueller@ebi.ac.uk   \n",
       "\n",
       "                                                                                             keywords  \\\n",
       "Title                                                                                                   \n",
       "deepbipolar: identifying genomic mutations for ...  bipolar disorder; convolutional neural network...   \n",
       "precision medicine for heart failure with prese...  Cluster analysis; Heart failure with preserved...   \n",
       "deep learning for healthcare: review, opportuni...  biomedical informatics; deep learning; electro...   \n",
       "hla class i binding prediction via convolutiona...                                                NaN   \n",
       "deepcpg: accurate prediction of single-cell dna...  Artificial neural network; DNA methylation; De...   \n",
       "\n",
       "                                                                                          fullURL  \\\n",
       "Title                                                                                               \n",
       "deepbipolar: identifying genomic mutations for ...  https://www.ncbi.nlm.nih.gov//pubmed/28600868   \n",
       "precision medicine for heart failure with prese...  https://www.ncbi.nlm.nih.gov//pubmed/28585183   \n",
       "deep learning for healthcare: review, opportuni...  https://www.ncbi.nlm.nih.gov//pubmed/28481991   \n",
       "hla class i binding prediction via convolutiona...  https://www.ncbi.nlm.nih.gov//pubmed/28444127   \n",
       "deepcpg: accurate prediction of single-cell dna...  https://www.ncbi.nlm.nih.gov//pubmed/28395661   \n",
       "\n",
       "                                                    source  year  \\\n",
       "Title                                                              \n",
       "deepbipolar: identifying genomic mutations for ...  pubmed  2017   \n",
       "precision medicine for heart failure with prese...  pubmed  2017   \n",
       "deep learning for healthcare: review, opportuni...  pubmed  2017   \n",
       "hla class i binding prediction via convolutiona...  pubmed  2017   \n",
       "deepcpg: accurate prediction of single-cell dna...  pubmed  2017   \n",
       "\n",
       "                                                                                     key  \\\n",
       "Title                                                                                      \n",
       "deepbipolar: identifying genomic mutations for ...  c3510786-340b-45d7-9272-eee0c8960971   \n",
       "precision medicine for heart failure with prese...  640584a9-ad0c-46cf-97f9-575c47f80b99   \n",
       "deep learning for healthcare: review, opportuni...  bae357bb-3bec-43da-8cc4-5095bc4d9bbe   \n",
       "hla class i binding prediction via convolutiona...  de37062a-9cfd-460b-9c90-e94689fefd04   \n",
       "deepcpg: accurate prediction of single-cell dna...  bb3fc9dd-5fc2-4cd5-9173-485ab5481c7e   \n",
       "\n",
       "                                                    use  \n",
       "Title                                                    \n",
       "deepbipolar: identifying genomic mutations for ...    1  \n",
       "precision medicine for heart failure with prese...    1  \n",
       "deep learning for healthcare: review, opportuni...    1  \n",
       "hla class i binding prediction via convolutiona...    1  \n",
       "deepcpg: accurate prediction of single-cell dna...    1  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame.from_csv('/home/ahmed/Dropbox/DFCI/14_zoo/scrap-science/pubmed/out/pubmed_result.csv')\n",
    "print df.shape\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the rows whose email value is missing (NaN).\n",
    "# A vectorized null check replaces the per-row df.iloc[i] loop, which built a Series per row.\n",
    "# int() keeps `counter` a plain Python int, as the loop-based version produced.\n",
    "counter = int(df['email'].isnull().sum())\n",
    "print(counter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "venvradiomicsio",
   "language": "python",
   "name": "venvradiomicsio"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
